1 STRATAA microbiome analysis

1.1 Imports

library(rbiom)
library(kableExtra)
library(ggplot2)
library(ggpubr)
library(ggforce)
library(patchwork)
library(vegan)
library(dplyr)
library(forcats)

1.2 Sources

The file handles are set in config.R as they’re used by both this script and data_cleaning.

source("/Users/flashton/Dropbox/GordonGroup/STRATAA_Microbiome/from_Leo/Leonardos_analysis/bin/core_functions.R")
source("/Users/flashton/Dropbox/GordonGroup/STRATAA_Microbiome/from_Leo/Leonardos_analysis/bin/config.R")

1.3 Metadata basics

First do the plots with kruskall wallis comparison between groups to see if there’s an overall difference, then do the pairwise tests because KW says there is a difference.

metadata <- read_metadata(metadata_handle)

metadata <- metadata %>% mutate(Group = if_else(Group == 'Control_HealthySerosurvey', 'Healthy Control', Group))
metadata <- metadata %>% mutate(Group = if_else(Group == 'Acute_Typhi', 'Acute typhoid', Group))
number_per_country <- metadata %>% group_by(Country) %>% summarise(count = n())

number_per_country %>% kbl() %>% kable_styling()
Country count
Bangladesh 120
Malawi 114
Nepal 76
print(sum(number_per_country$count))
## [1] 310
metadata %>% group_by(Country, Group) %>% summarise(count = n()) %>% kbl() %>% kable_styling()
Country Group count
Bangladesh Acute typhoid 40
Bangladesh Carrier 40
Bangladesh Healthy Control 40
Malawi Acute typhoid 34
Malawi Carrier 40
Malawi Healthy Control 40
Nepal Acute typhoid 29
Nepal Carrier 30
Nepal Healthy Control 17
metadata %>% group_by(Group) %>% summarise(count = n()) %>% kbl() %>% kable_styling()
Group count
Acute typhoid 103
Carrier 110
Healthy Control 97
baseline_chars <- get_baseline_characteristics(metadata)

# baseline_chars$pct_anti
# baseline_chars$pct_anti %>% na_if(0)


# BEWARE - na_if(0.0) will convert ALL 0s to NAs, this is ok as they're only in the antibiotics of carriers and controls
# at the moment, but if others get added in, will screw with it.
# baseline_chars %>% rename(`Median age` = median_age, `Women (%)` = pct_fem, `Antibiotics in last 2 weeks (%)` = pct_anti) %>% pivot_longer(!c(Country, Group)) %>% rename(variable_name = name) %>% pivot_wider(names_from = Group, values_from = value) %>% filter(variable_name != 'number') %>% na_if(0.0) %>% kbl(digits = c(NA, NA, 1, 1, 1)) %>% kable_styling()

# na_if stopped working, so had to do this instead
baseline_chars_table <- baseline_chars %>% rename(`Median age` = median_age, `Women (%)` = pct_fem, `Antibiotics in last 2 weeks (%)` = pct_anti) %>% pivot_longer(!c(Country, Group)) %>% rename(variable_name = name) %>% pivot_wider(names_from = Group, values_from = value) %>% filter(variable_name != 'number') 

baseline_chars_table$Carrier <- replace(baseline_chars_table$Carrier,which(baseline_chars_table$Carrier==0),NA)
baseline_chars_table$`Healthy Control` <- replace(baseline_chars_table$`Healthy Control`,which(baseline_chars_table$`Healthy Control`==0),NA)
baseline_chars_table %>% kbl(digits = c(NA, NA, 1, 1, 1)) %>% kable_styling()
Country variable_name Acute typhoid Carrier Healthy Control
Bangladesh Median age 6.0 37.0 28.5
Bangladesh Women (%) 47.5 47.5 65.0
Bangladesh Antibiotics in last 2 weeks (%) 37.5 NA NA
Malawi Median age 9.7 32.0 24.0
Malawi Women (%) 64.7 57.5 65.0
Malawi Antibiotics in last 2 weeks (%) 26.5 NA NA
Nepal Median age 17.2 43.9 35.0
Nepal Women (%) 24.1 66.7 82.4
Nepal Antibiotics in last 2 weeks (%) 44.8 NA NA
# the kruskal wallis test is positive, showing a difference between the groups, so we move onto pairwise tests.
# ggplot(metadata, aes(x = Country, y = Age, fill = Group)) + geom_boxplot() + stat_compare_means(method = 'kruskal.test', label = "p")
comparisons_groups <- list(c("Acute typhoid", "Carrier"), c("Acute typhoid", "Healthy Control"), c("Carrier", "Healthy Control"))
comparisons_countries <- list(c('Bangladesh', 'Malawi'), c('Bangladesh', 'Nepal'), c('Malawi', 'Nepal'))
# default stat method when doing pairwise tests in wilcoxon.
ggboxplot(metadata, facet.by = "Country", y = "Age", x = "Group", color = "Group") + stat_compare_means(comparisons = comparisons_groups) + rremove("x.text") + rremove("xlab") + rremove("x.ticks") # +rotate_x_text(angle = 45)

ggboxplot(metadata, facet.by = "Group", y = "Age", x = "Country", color = "Country") + stat_compare_means(comparisons = comparisons_countries) + rotate_x_text(angle = 45)

country_group_sex <- metadata %>% group_by(Country, Group, Sex) %>% summarise(count = n()) 

plot_sex <- function(eg1, c){
  d <- eg1 %>% filter(Country == c)
  p <- ggplot(d, aes(x = Group, y = count, fill = Sex)) + 
    geom_bar(stat ='identity', position = 'fill') + 
    ylab('Proportion') + 
    ggtitle(c)# +
    #theme(axis.text=element_text(size=34), axis.title=element_text(size=36,face="bold"), plot.title = element_text(size = 40, face = "bold"), legend.key.size = unit(4, 'cm'), legend.title = element_text(size = 34), legend.text = element_text(size = 28))
  
  return(p)
}

m_sex <- plot_sex(country_group_sex, 'Malawi')
b_sex <- plot_sex(country_group_sex, 'Bangladesh')
n_sex <- plot_sex(country_group_sex, 'Nepal')
m_sex / b_sex / n_sex 

1.4 read in metaphlan data

strataa_metaphlan_data <- read.csv(file = file.path(metaphlan_input_folder, '2023.05.11.all_strataa_metaphlan.csv'), header= TRUE, sep = ",", row.names = 1, stringsAsFactors = FALSE, check.names=FALSE)
strataa_metaphlan_data$lowest_taxonomic_level <- sapply(str_split(row.names(strataa_metaphlan_data), "\\|"), function(x) x[length(x)])

strataa_metaphlan_metadata <- read.csv(file = file.path(metaphlan_input_folder, '2023.05.11.strataa_metadata.metaphlan.csv'), header = TRUE, sep = ",", row.names = 1, stringsAsFactors = FALSE)
strataa_metaphlan_metadata <- strataa_metaphlan_metadata %>% mutate(SampleID = rownames(strataa_metaphlan_metadata))
strataa_metaphlan_metadata <- strataa_metaphlan_metadata %>% mutate(age_bracket=cut(Age, breaks=c(0, 1, 5, 15, Inf), labels=c("<1", "1-5", "6-15", ">15")))

1.5 all countries, all groups

1.5.1 Alpha diversity - metaphlan

Alpha diversity - all countries, all groups

all_countries_all_groups_alpha <- metaphlan_alpha(strataa_metaphlan_data, strataa_metaphlan_metadata, countries_of_interest = c('Bangladesh', 'Malawi', 'Nepal'), groups_of_interest = c('Acute_Typhi', 'Carrier', 'Control_HealthySerosurvey'), comparisons = list(c('Acute_Typhi', 'Carrier'), c('Acute_Typhi', 'Control_HealthySerosurvey'), c('Control_HealthySerosurvey', 'Carrier')))

all_countries_all_groups_alpha$alpha_anova_summary_with_var_names %>% dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% kbl() %>% kable_styling()
rownames(alpha_anova_summary[[1]]) Df Sum.Sq Mean.Sq F.value Pr..F. is_it_significant
Country:Group 4 7.82 1.96 11.7 9.31e-09 significant
Group 2 5.92 2.96 17.6 6.23e-08 significant
Country 2 2.75 1.38 8.2 0.000348 significant
Country:Sex 2 1.15 0.575 3.42 0.034 not_significant
Sex:Group 2 1.08 0.539 3.21 0.0417 not_significant
Country:Sex:Age 2 1.07 0.534 3.18 0.0431 not_significant
Country:Age 2 0.61 0.305 1.82 0.164 not_significant
Country:Sex:Group 4 1.08 0.27 1.61 0.172 not_significant
Sex:Age 1 0.308 0.308 1.84 0.177 not_significant
Country:Group:Age 4 0.834 0.209 1.24 0.293 not_significant
Group:Age 2 0.228 0.114 0.68 0.507 not_significant
Sex 1 0.0294 0.0294 0.175 0.676 not_significant
Sex:Group:Age 2 0.101 0.0506 0.302 0.74 not_significant
Country:Sex:Group:Age 4 0.0673 0.0168 0.1 0.982 not_significant
Age 1 1.07e-08 1.07e-08 6.36e-08 1 not_significant
Residuals 274 46 0.168 NA NA NA
all_countries_all_groups_alpha$alpha_plot_group

all_countries_all_groups_alpha$alpha_plot_antibiotics

all_countries_all_groups_alpha$alpha_by_country

### Beta diversity - metaphlan

All groups.

all_countries_beta_all_groups <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Bangladesh', 'Malawi', 'Nepal'), c('Acute_Typhi', 'Carrier', 'Control_HealthySerosurvey'))
all_countries_beta_all_groups$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.0759082 0.0009990 significant
Group:Age 0.0123544 0.0049950 significant
Sex:Group:Age 0.0081926 0.1028971 not_significant
Sex:Group 0.0079745 0.1258741 not_significant
Age 0.0038070 0.1698302 not_significant
Sex:Age 0.0037273 0.1798202 not_significant
Sex 0.0035891 0.2467532 not_significant
Total 1.0000000 NA NA
Residual 0.8844470 NA NA
bgd_beta_all_groups <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Bangladesh'), c('Acute_Typhi', 'Carrier', 'Control_HealthySerosurvey'))
bgd_beta_all_groups$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.2679584 0.0009990 significant
Sex:Age 0.0094338 0.1248751 not_significant
Age 0.0084979 0.1778222 not_significant
Sex:Group 0.0155854 0.2037962 not_significant
Sex 0.0078067 0.2187812 not_significant
Group:Age 0.0114756 0.5034965 not_significant
Sex:Group:Age 0.0097058 0.7192807 not_significant
Total 1.0000000 NA NA
Residual 0.6695363 NA NA
# bgd_beta_all_groups$pc12
# bgd_beta_all_groups$pc34

mal_beta_all_groups <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Malawi'), c('Acute_Typhi', 'Carrier', 'Control_HealthySerosurvey'))
mal_beta_all_groups$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.1942771 0.0009990 significant
Sex:Age 0.0138745 0.0219780 not_significant
Sex:Group 0.0231539 0.0249750 not_significant
Group:Age 0.0232750 0.0309690 not_significant
Age 0.0114622 0.0679321 not_significant
Sex 0.0075054 0.3336663 not_significant
Sex:Group:Age 0.0125237 0.6043956 not_significant
Total 1.0000000 NA NA
Residual 0.7139281 NA NA
# mal_beta_all_groups$pc12
# mal_beta_all_groups$pc34

nep_beta_all_groups <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Nepal'), c('Acute_Typhi', 'Carrier', 'Control_HealthySerosurvey'))
nep_beta_all_groups$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.0898195 0.0009990 significant
Sex 0.0176933 0.1228771 not_significant
Sex:Age 0.0150035 0.2207792 not_significant
Age 0.0125478 0.3946054 not_significant
Sex:Group 0.0239139 0.4975025 not_significant
Sex:Group:Age 0.0175838 0.8821179 not_significant
Group:Age 0.0164764 0.9410589 not_significant
Total 1.0000000 NA NA
Residual 0.8069619 NA NA
all_countries_beta_all_groups$pc12 / bgd_beta_all_groups$pc12 / nep_beta_all_groups$pc12

1.6 Healthy vs Typhi

1.6.1 Plot phyla bar plots

phyla <- strataa_metaphlan_data %>% mutate(clade_name = rownames(strataa_metaphlan_data)) %>% filter(grepl("p__", clade_name)) %>% filter(!grepl("c__", clade_name)) %>% pivot_longer(!c(clade_name, lowest_taxonomic_level), names_to = "sample", values_to = "relative_abundance")
metadata_for_phyla_plots <- strataa_metaphlan_metadata %>% dplyr::select(SampleID, Group, Country)

phyla_clean_metadata <- prep_data_to_plot_phyla(phyla, strataa_metaphlan_metadata)

order_of_groups <- c("Typhi", "Healthy")
bangladesh_phyla_plot <- plot_per_country_abundance(phyla_clean_metadata = phyla_clean_metadata, country = "Bangladesh", group_order = order_of_groups)
# bangladesh_phyla_plot
malawi_phyla_plot <- plot_per_country_abundance(phyla_clean_metadata = phyla_clean_metadata, country = "Malawi", group_order = order_of_groups)
nepal_phyla_plot <- plot_per_country_abundance(phyla_clean_metadata = phyla_clean_metadata, country = "Nepal", group_order = order_of_groups)

bangladesh_phyla_plot / malawi_phyla_plot / nepal_phyla_plot

# going back to phyla so that get all the weird rare ones.
# need to join to meta-data
# remove carriers
# group by clade, group, country
# summarise to get the median.
# pivot wider to get the table.
# dplyr::select(order(colnames(.))) rearranges the columns in alphabetical order
# relocate moves the clade_name column to the first column.
phyla %>% left_join(metadata_for_phyla_plots, by = c("sample" = "SampleID")) %>% filter(Group != 'Carrier') %>% group_by(clade_name, Group, Country) %>% summarise(median_prevalence = median(relative_abundance)) %>% pivot_wider(names_from = c('Country', 'Group'), values_from =  'median_prevalence') %>% dplyr::select(order(colnames(.))) %>% relocate(clade_name, .before = 1) %>% kbl() %>% kable_styling()
clade_name Bangladesh_Acute_Typhi Bangladesh_Control_HealthySerosurvey Malawi_Acute_Typhi Malawi_Control_HealthySerosurvey Nepal_Acute_Typhi Nepal_Control_HealthySerosurvey
k__Archaea|p__Candidatus_Thermoplasmatota 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Archaea|p__Euryarchaeota 0.000000 0.000000 0.249010 0.000000 0.00000 0.22474
k__Archaea|p__Thaumarchaeota 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Acidobacteria 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Actinobacteria 14.361185 13.136440 1.893725 0.524965 3.33634 3.81550
k__Bacteria|p__Bacteria_unclassified 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Bacteroidetes 0.911970 2.216730 15.361165 34.630185 28.24679 20.53695
k__Bacteria|p__Candidatus_Melainabacteria 0.000000 0.000000 0.038085 0.052210 0.00000 0.02157
k__Bacteria|p__Candidatus_Saccharibacteria 0.017310 0.015625 0.000000 0.000000 0.00000 0.00203
k__Bacteria|p__Chloroflexi 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Elusimicrobia 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Firmicutes 80.670075 72.161565 60.161350 58.311770 53.74090 60.17250
k__Bacteria|p__Fusobacteria 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Lentisphaerae 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Proteobacteria 0.629715 1.272295 6.606440 6.195335 2.90174 2.52509
k__Bacteria|p__Spirochaetes 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Synergistetes 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Tenericutes 0.000000 0.000000 0.002715 0.000000 0.01965 0.01362
k__Bacteria|p__Verrucomicrobia 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Eukaryota|p__Ascomycota 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Eukaryota|p__Eukaryota_unclassified 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000

1.6.2 alpha diversity

Alpha diversity - all countries, healthy and acute

all_countries_healthy_acute_alpha <- metaphlan_alpha(strataa_metaphlan_data, strataa_metaphlan_metadata, countries_of_interest = c('Bangladesh', 'Malawi', 'Nepal'), groups_of_interest = c('Acute_Typhi', 'Control_HealthySerosurvey'), comparisons = list(c('Acute_Typhi', 'Control_HealthySerosurvey')))

# all_countries_healthy_acute_alpha$alpha_by_country
all_countries_healthy_acute_alpha$alpha_anova_summary_with_var_names %>% dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% kbl() %>% kable_styling()
rownames(alpha_anova_summary[[1]]) Df Sum.Sq Mean.Sq F.value Pr..F. is_it_significant
Country 2 5.74 2.87 17.4 1.36e-07 significant
Country:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 2 1.68 0.842 5.12 0.00699 significant
Sex:Age 1 1.07 1.07 6.5 0.0117 not_significant
Country:Sex 2 1.5 0.75 4.56 0.0119 not_significant
Country:Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 2 1.39 0.693 4.21 0.0165 not_significant
Country:Age 2 1.02 0.51 3.1 0.0477 not_significant
Sex:Group 1 0.628 0.628 3.82 0.0525 not_significant
Country:Antibiotics_taken_before_sampling_yes_no_assumptions 2 0.816 0.408 2.48 0.0871 not_significant
Country:Group 2 0.698 0.349 2.12 0.123 not_significant
Antibiotics_taken_before_sampling_yes_no_assumptions 2 0.618 0.309 1.88 0.157 not_significant
Group 1 0.164 0.164 0.997 0.319 not_significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 2 0.359 0.179 1.09 0.339 not_significant
Sex 1 0.0592 0.0592 0.36 0.55 not_significant
Country:Sex:Age 2 0.188 0.0938 0.57 0.567 not_significant
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 1 0.0511 0.0511 0.31 0.578 not_significant
Sex:Group:Age 1 0.0368 0.0368 0.224 0.637 not_significant
Group:Age 1 0.0147 0.0147 0.0891 0.766 not_significant
Country:Group:Age 2 0.0713 0.0357 0.217 0.805 not_significant
Age 1 0.00772 0.00772 0.0469 0.829 not_significant
Country:Sex:Group 2 0.0595 0.0297 0.181 0.835 not_significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 1 0.00675 0.00675 0.041 0.84 not_significant
Country:Sex:Group:Age 2 0.05 0.025 0.152 0.859 not_significant
Country:Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 1 0.000136 0.000136 0.000828 0.977 not_significant
Residuals 163 26.8 0.165 NA NA NA
all_countries_healthy_acute_alpha$alpha_plot_group

all_countries_healthy_acute_alpha$alpha_plot_antibiotics

alpha diversity, malawi only

malawi_healthy_acute_alpha <- metaphlan_alpha(strataa_metaphlan_data, strataa_metaphlan_metadata, countries_of_interest = c('Malawi'), groups_of_interest = c('Acute_Typhi', 'Control_HealthySerosurvey'), comparisons = list(c('Acute_Typhi', 'Control_HealthySerosurvey')))

malawi_healthy_acute_alpha$alpha_anova_summary_with_var_names %>% dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% kbl() %>% kable_styling()
rownames(alpha_anova_summary[[1]]) Df Sum.Sq Mean.Sq F.value Pr..F. is_it_significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 1 2.27 2.27 16 0.000167 significant
Antibiotics_taken_before_sampling_yes_no_assumptions 1 1.31 1.31 9.27 0.0034 significant
Sex 1 1.16 1.16 8.21 0.00567 significant
Group 1 0.84 0.84 5.93 0.0177 not_significant
Age 1 0.451 0.451 3.19 0.079 not_significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 1 0.406 0.406 2.87 0.0952 not_significant
Sex:Group 1 0.313 0.313 2.21 0.142 not_significant
Sex:Group:Age 1 0.0978 0.0978 0.691 0.409 not_significant
Sex:Age 1 0.0164 0.0164 0.116 0.734 not_significant
Group:Age 1 0.00122 0.00122 0.00864 0.926 not_significant
Residuals 63 8.91 0.141 NA NA NA
malawi_healthy_acute_alpha$alpha_plot_group

malawi_healthy_acute_alpha$alpha_plot_antibiotics

1.6.3 beta diversity

Acute vs healthy.

all_countries_beta_acute_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Bangladesh', 'Malawi', 'Nepal'), c('Acute_Typhi', 'Control_HealthySerosurvey'))
all_countries_beta_acute_healthy$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.0285252 0.0009990 significant
Group:Age 0.0117687 0.0159840 not_significant
Age 0.0092014 0.0379620 not_significant
Sex:Group:Age 0.0072077 0.1148851 not_significant
Sex 0.0070879 0.1328671 not_significant
Antibiotics_taken_before_sampling_yes_no_assumptions 0.0117438 0.1928072 not_significant
Sex:Age 0.0058394 0.2427572 not_significant
Sex:Group 0.0057923 0.2427572 not_significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0055130 0.2827173 not_significant
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0043096 0.5244755 not_significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0070480 0.8351648 not_significant
Total 1.0000000 NA NA
Residual 0.8959630 NA NA
bgd_beta_acute_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Bangladesh'), c('Acute_Typhi', 'Control_HealthySerosurvey'))
bgd_beta_acute_healthy$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.0782718 0.0009990 significant
Sex 0.0230592 0.0299700 not_significant
Antibiotics_taken_before_sampling_yes_no_assumptions 0.0199564 0.0759241 not_significant
Sex:Age 0.0168558 0.1438561 not_significant
Age 0.0146689 0.2307692 not_significant
Group:Age 0.0109377 0.4825175 not_significant
Sex:Group:Age 0.0100500 0.5934066 not_significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0080933 0.7442557 not_significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0074906 0.8351648 not_significant
Sex:Group 0.0067483 0.9050949 not_significant
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0059228 0.9150849 not_significant
Total 1.0000000 NA NA
Residual 0.7979452 NA NA
mal_beta_acute_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Malawi'), c('Acute_Typhi', 'Control_HealthySerosurvey'))
mal_beta_acute_healthy$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.1995900 0.0009990 significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0371210 0.0009990 significant
Antibiotics_taken_before_sampling_yes_no_assumptions 0.0381931 0.0029970 significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0267524 0.0159840 not_significant
Sex:Group 0.0200496 0.0329670 not_significant
Age 0.0194908 0.0399600 not_significant
Sex:Age 0.0144797 0.1358641 not_significant
Sex 0.0141068 0.1388611 not_significant
Sex:Group:Age 0.0091754 0.4935065 not_significant
Group:Age 0.0078947 0.6063936 not_significant
Total 1.0000000 NA NA
Residual 0.6131466 NA NA
nep_beta_acute_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Nepal'), c('Acute_Typhi', 'Control_HealthySerosurvey'))
nep_beta_acute_healthy$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.0598990 0.0029970 significant
Sex 0.0484279 0.0129870 not_significant
Sex:Group 0.0246755 0.3026973 not_significant
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0244159 0.3226773 not_significant
Antibiotics_taken_before_sampling_yes_no_assumptions 0.0450942 0.3896104 not_significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0207951 0.4775225 not_significant
Sex:Group:Age 0.0175717 0.6963037 not_significant
Age 0.0154038 0.8061938 not_significant
Sex:Age 0.0148604 0.8171828 not_significant
Group:Age 0.0102689 0.9820180 not_significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0188359 0.9990010 not_significant
Total 1.0000000 NA NA
Residual 0.6997518 NA NA
all_countries_beta_acute_healthy$pc12 + bgd_beta_acute_healthy$pc12 + mal_beta_acute_healthy$pc12 + nep_beta_acute_healthy$pc12

1.6.4 maaslin2 taxonomy

Maaslin basics

groups_to_analyse <- c('Acute_Typhi', 'Control_HealthySerosurvey')
bang_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions")
mwi_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions", "sequencing_lane")
nep_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions")
bangladesh_taxonomic_maaslin <- read_in_maaslin('Bangladesh', groups_to_analyse, bang_variables_for_analysis, 'metaphlan')
malawi_taxonomic_maaslin <- read_in_maaslin('Malawi', groups_to_analyse, mwi_variables_for_analysis, 'metaphlan')
nepal_taxonomic_maaslin <- read_in_maaslin('Nepal', groups_to_analyse, nep_variables_for_analysis, 'metaphlan')
bangladesh_taxonomic_maaslin_filtered <- filter_taxonomic_maaslin(bangladesh_taxonomic_maaslin)
malawi_taxonomic_maaslin_filtered <- filter_taxonomic_maaslin(malawi_taxonomic_maaslin)
nepal_taxonomic_maaslin_filtered <- filter_taxonomic_maaslin(nepal_taxonomic_maaslin)
bangladesh_maaslin_stats <- basic_maaslin_stats(bangladesh_taxonomic_maaslin_filtered, 'Bangladesh', bang_variables_for_analysis, groups_to_analyse)
malawi_maaslin_stats <- basic_maaslin_stats(malawi_taxonomic_maaslin_filtered, 'Malawi', mwi_variables_for_analysis, groups_to_analyse)
nepal_maaslin_stats <- basic_maaslin_stats(nepal_taxonomic_maaslin_filtered, 'Nepal', nep_variables_for_analysis, groups_to_analyse)
malawi_maaslin_stats$volcano_plot / bangladesh_maaslin_stats$volcano_plot / nepal_maaslin_stats$volcano_plot

nrow(malawi_maaslin_stats$maaslin_results_sig)
## [1] 296
nrow(bangladesh_maaslin_stats$maaslin_results_sig)
## [1] 23
nrow(nepal_maaslin_stats$maaslin_results_sig)
## [1] 0

There were 296 species significantly (q-val < 0.05) associated with health/disease in Malawi, in Bangladesh, and in Nepal.

Combine the taxonomic maaslins, and print out the species that are sig in both and share directions.

Because sequencing run and participant type are totally confounded for Bangladesh, need to exclude sequencing run from the final model for Bangladesh (otherwise, wipes out the signals).

associated at both sites

bang_mal <- list(bangladesh_taxonomic_maaslin_filtered, malawi_taxonomic_maaslin_filtered)
combined_results <- run_combine_maaslins(bang_mal, c('_bang', '_mal'), mwi_variables_for_analysis, groups_to_analyse, 'metaphlan', maaslin_taxonomic_output_root_folder)

# View(combined_results$positive_coef)
# View(combined_results$negative_coef)


combined_results$positive_coef %>% filter(grepl('^s', lowest_taxonomic_level)) %>% 
  select(!c(metadata, value, N_bang, N.not.0_bang, pval_bang, N_mal, N.not.0_mal, pval_mal)) %>% 
  rename(Species = lowest_taxonomic_level, `Coefficient Bangladesh` = coef_bang, `Standard Error Bangladesh` = stderr_bang, `Q-value Bangladesh` = qval_bang, `Coefficient Malawi` = coef_mal, `Standard Error Malawi` = stderr_mal, `Q-value Malawi` = qval_mal) %>% 
  dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% 
  kbl() %>% kable_styling()
feature Species Coefficient Bangladesh Standard Error Bangladesh Q-value Bangladesh Coefficient Malawi Standard Error Malawi Q-value Malawi
k__Bacteria.p__Firmicutes.c__Clostridia.o__Eubacteriales.f__Clostridiaceae.g__Clostridium.s__Clostridium_SGB6179 s__Clostridium_SGB6179 8.79 1.34 4.25e-05 3.11 0.936 0.0286
k__Bacteria.p__Bacteroidetes.c__Bacteroidia.o__Bacteroidales.f__Prevotellaceae.g__Prevotella.s__Prevotella_copri_clade_A s__Prevotella_copri_clade_A 4.54 0.978 0.00971 7.64 1.25 1.21e-05
k__Bacteria.p__Firmicutes.c__Negativicutes.o__Veillonellales.f__Veillonellaceae.g__GGB4266.s__GGB4266_SGB5809 s__GGB4266_SGB5809 4.58 1.09 0.0147 6.94 0.981 4.38e-07
k__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Pasteurellales.f__Pasteurellaceae.g__Haemophilus.s__Haemophilus_parainfluenzae s__Haemophilus_parainfluenzae 3.64 0.979 0.03 6.92 0.953 2.26e-07
k__Bacteria.p__Firmicutes.c__Clostridia.o__Eubacteriales.f__Peptostreptococcaceae.g__Romboutsia.s__Romboutsia_timonensis s__Romboutsia_timonensis 4.52 1.25 0.0386 5.07 0.903 5.91e-05
combined_results$negative_coef %>% filter(grepl('^s', lowest_taxonomic_level)) %>% 
  select(!c(metadata, value, N_bang, N.not.0_bang, pval_bang, N_mal, N.not.0_mal, pval_mal)) %>% 
  rename(Species = lowest_taxonomic_level, `Coefficient Bangladesh` = coef_bang, `Standard Error Bangladesh` = stderr_bang, `Q-value Bangladesh` = qval_bang, `Coefficient Malawi` = coef_mal, `Standard Error Malawi` = stderr_mal, `Q-value Malawi` = qval_mal) %>% 
  dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% 
  kbl() %>% kable_styling()
feature Species Coefficient Bangladesh Standard Error Bangladesh Q-value Bangladesh Coefficient Malawi Standard Error Malawi Q-value Malawi
k__Bacteria.p__Firmicutes.c__Clostridia.o__Eubacteriales.f__Lachnospiraceae.g__Lachnospiraceae_unclassified.s__Lachnospiraceae_bacterium s__Lachnospiraceae_bacterium -3.05 0.838 0.0366 -2.87 0.785 0.0135
# todo - refactoring the run_combine_maaslins means that we dont get the species that are only associated at one site. need to fix that.

# nrow(combined_results$mwi_maaslin_only)
# nrow(combined_results$bang_maaslin_only)

There were species significantly (q-val < 0.05) associated with health/disease in Malawi only and in Bangladesh only.

The ones associated at only one site are written out to a file, you can look at them manually there.

1.6.5 plots of species of interest

strataa_metaphlan_data_longer <- strataa_metaphlan_data %>% mutate(feature = rownames(strataa_metaphlan_data)) %>% pivot_longer(!c(feature, lowest_taxonomic_level), names_to = "SampleID", values_to = "prevalence")
strataa_metaphlan_data_longer_meta <- strataa_metaphlan_data_longer %>% left_join(strataa_metaphlan_metadata, by = c("SampleID" = "SampleID"))


pc <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Prevotella_copri_clade_A')

cs <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Clostridium_SGB6179')

SGB5809 <-run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__GGB4266_SGB5809')

hp <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Haemophilus_parainfluenzae')

rt <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Romboutsia_timonensis')

lb <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Lachnospiraceae_bacterium')

# pc + cs + SGB5809 + hp + rt + lb

1.6.6 P. copri clades

Print the results for all the P. copri clades

# all_taxa_maaslin <- combined_maaslins$all_features
# p_copri_maaslin <- all_taxa_maaslin %>% filter(grepl('_Prevotella_copri_', feature)) %>% filter(qval_bang < 0.05 | qval_mal < 0.05)
# write_csv(p_copri_maaslin, file.path(maaslin_taxonomic_output_root_folder, 'combined_maaslins_p_copri.csv'))

1.6.7 maaslin functional groups

Functional groups associated with health

bang_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions")
mwi_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions", "sequencing_lane")
# nep_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions")
groups_to_analyse <- c('Acute_Typhi', 'Control_HealthySerosurvey')

bang_func_maaslin <- read_in_maaslin('Bangladesh', groups_to_analyse, bang_variables_for_analysis, 'bigmap')
mal_func_maaslin <- read_in_maaslin('Malawi', groups_to_analyse, mwi_variables_for_analysis, 'bigmap')
# combined_results <- run_combine_maaslins(bang_func_maaslin, mal_func_maaslin, bang_variables_for_analysis, mwi_variables_for_analysis, groups_to_analyse, 'bigmap', maaslin_functional_output_root_folder)

bang_mal <- list(bang_func_maaslin, mal_func_maaslin)
combined_results <- run_combine_maaslins(bang_mal, c('_bang', '_mal'), mwi_variables_for_analysis, groups_to_analyse, 'bigmap', maaslin_functional_output_root_folder)


combined_results$positive_coef %>%
  select(!c(metadata, value, N_bang, N.not.0_bang, pval_bang, N_mal, N.not.0_mal, pval_mal)) %>% 
  rename(`Coefficient Bangladesh` = coef_bang, `Standard Error Bangladesh` = stderr_bang, `Q-value Bangladesh` = qval_bang, `Coefficient Malawi` = coef_mal, `Standard Error Malawi` = stderr_mal, `Q-value Malawi` = qval_mal) %>% 
  dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% 
  kbl() %>% kable_styling()
MGC_class Species feature Coefficient Bangladesh Standard Error Bangladesh Q-value Bangladesh Coefficient Malawi Standard Error Malawi Q-value Malawi
succinate2propionate Dialister_invisus_DSM_15470_genomic_scaffold gb.GG698602.1.region002.GC_DNA..Entryname.succinate2propionate..OS.Dialister_invisus_DSM_15470_genomic_scaffold..SMASHregion.region002..NR.1 4.49 0.769 0.000238 4.32 0.937 0.00196
Pyruvate2acetate.formate Prevotella_copri_strain_AM22.19 gb.QRIF01000001.1.region001.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Prevotella_copri_strain_AM22.19..SMASHregion.region001..NR.7 4.95 0.865 0.000259 4.76 1.23 0.0122
TPP_AA_metabolism Prevotella_sp._S7_MS_2 gb.JRPT01000001.1.region001.GC_DNA..Entryname.TPP_AA_metabolism..OS.Prevotella_sp._S7_MS_2..SMASHregion.region001..NR.11..BG.2 4.86 0.89 0.000381 5.14 1.16 0.00314
Arginine2_Hcarbonate .Clostridium._sordellii_genome_assembly gb.LN679998.1.region007.GC_DNA..Entryname.Arginine2_Hcarbonate..OS..Clostridium._sordellii_genome_assembly..SMASHregion.region007..NR.2 4.59 0.824 0.000381 3.51 0.732 0.00116
Others_HGD_unassigned Prevotella_copri_strain_AF24.12 gb.QRVA01000039.1.region001.GC_DNA..Entryname.Others_HGD_unassigned..OS.Prevotella_copri_strain_AF24.12..SMASHregion.region001..NR.6 4.72 0.862 0.000381 5.74 1.09 0.000359
Pyruvate2acetate.formate Prevotella_sp._AM34.19LB gb.QSIG01000002.1.region002.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Prevotella_sp._AM34.19LB..SMASHregion.region002..NR.3 4.25 0.783 0.000381 3.28 0.956 0.0336
TPP_fatty_acids Prevotella_copri_strain_AM22.19 gb.QRIF01000003.1.region001.GC_DNA..Entryname.TPP_fatty_acids..OS.Prevotella_copri_strain_AM22.19..SMASHregion.region001..NR.6 5.07 0.952 0.000422 5.33 1.07 0.000706
Rnf_complex Prevotella_copri_strain_AF38.11 gb.QROP01000022.1.region001.GC_DNA..Entryname.Rnf_complex..OS.Prevotella_copri_strain_AF38.11..SMASHregion.region001..NR.6 4.74 0.914 0.000649 5.23 1.12 0.00176
PFOR_II_pathway Romboutsia_ilealis_strain_CRIB_genome gb.LN555523.1.region001.GC_DNA..Entryname.PFOR_II_pathway..OS.Romboutsia_ilealis_strain_CRIB_genome..SMASHregion.region001..NR.1 4.84 0.983 0.00104 2.98 0.719 0.00629
Others_HGD_unassigned Romboutsia_ilealis_strain_CRIB_genome gb.LN555523.1.region003.GC_DNA..Entryname.Others_HGD_unassigned..OS.Romboutsia_ilealis_strain_CRIB_genome..SMASHregion.region003..NR.1 4.59 0.935 0.00105 2.76 0.558 0.000812
porA Prevotella_copri_strain_AF10.17 gb.QSAV01000013.1.region001.GC_DNA..Entryname.porA..OS.Prevotella_copri_strain_AF10.17..SMASHregion.region001..NR.6 4.46 0.918 0.00124 5.08 1.17 0.00387
Rnf_complex Romboutsia_ilealis_strain_CRIB_genome gb.LN555523.1.region002.GC_DNA..Entryname.Rnf_complex..OS.Romboutsia_ilealis_strain_CRIB_genome..SMASHregion.region002..NR.1 4.36 0.957 0.0031 2.83 0.622 0.00228
PFOR_II_pathway Romboutsia_ilealis_strain_CRIB_genome gb.LN555523.1.region004.GC_DNA..Entryname.PFOR_II_pathway..OS.Romboutsia_ilealis_strain_CRIB_genome..SMASHregion.region004..NR.1 4.39 0.968 0.00318 3.37 0.814 0.00646
Formate_dehydrogenase Haemophilus_sp._HMSC61B11_genomic_scaffold gb.KV837963.1.region001.GC_DNA..Entryname.Formate_dehydrogenase..OS.Haemophilus_sp._HMSC61B11_genomic_scaffold..SMASHregion.region001..NR.5 3.41 0.754 0.00327 4.89 0.985 0.000767
Pyruvate2acetate.formate.Acetyl.CoA_pathway Romboutsia_ilealis_strain_CRIB_genome gb.LN555523.1.region005.GC_DNA..Entryname.Pyruvate2acetate.formate.Acetyl.CoA_pathway..OS.Romboutsia_ilealis_strain_CRIB_genome..SMASHregion.region005..NR.1 4.26 1 0.0072 4.58 1.11 0.00659
Pyruvate2acetate.formate Clostridium_carboxidivorans gb.CP011803.1.region013.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Clostridium_carboxidivorans..SMASHregion.region013..NR.1 3.69 0.89 0.00954 3.11 0.762 0.00733
OD_eut_pdu_related.PFOR_II_pathway Paeniclostridium_sordellii_strain_AM370 gb.CP014150.1.region011.GC_DNA..Entryname.OD_eut_pdu_related.PFOR_II_pathway..OS.Paeniclostridium_sordellii_strain_AM370..SMASHregion.region011..NR.4 2.61 0.629 0.00954 3.94 0.574 4.58e-06
TPP_AA_metabolism.Arginine2putrescine Haemophilus_sp._HMSC61B11_genomic_scaffold gb.KV837955.1.region001.GC_DNA..Entryname.TPP_AA_metabolism.Arginine2putrescine..OS.Haemophilus_sp._HMSC61B11_genomic_scaffold..SMASHregion.region001..NR.5 3.93 0.951 0.00959 4.39 0.918 0.0012
Respiratory_glycerol Haemophilus_parainfluenzae_HK2019 gb.AJTC01000042.1.region001.GC_DNA..Entryname.Respiratory_glycerol..OS.Haemophilus_parainfluenzae_HK2019..SMASHregion.region001..NR.5 3.55 0.882 0.0136 5.14 0.975 0.000344
Pyruvate2acetate.formate Haemophilus_parainfluenzae_HK262 gb.AJMW01000041.1.region001.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Haemophilus_parainfluenzae_HK262..SMASHregion.region001..NR.5 3.17 0.791 0.0137 5.08 1.01 0.000657
TPP_AA_metabolism Clostridium_celatum_DSM_1785_genomic_scaffold gb.KB291615.1.region002.GC_DNA..Entryname.TPP_AA_metabolism..OS.Clostridium_celatum_DSM_1785_genomic_scaffold..SMASHregion.region002..NR.1 3.24 0.813 0.0138 2.5 0.72 0.0303
Arginine2putrescine.Putrescine2spermidine Romboutsia_sp._Frifi_strain_FRIFI_genome gb.LN650648.1.region002.GC_DNA..Entryname.Arginine2putrescine.Putrescine2spermidine..OS.Romboutsia_sp._Frifi_strain_FRIFI_genome..SMASHregion.region002..NR.1 2.66 0.683 0.0177 2.57 0.523 0.000868
Pyruvate2acetate.formate Haemophilus_haemolyticus_HK386 gb.AJSV01000030.1.region001.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Haemophilus_haemolyticus_HK386..SMASHregion.region001..NR.2 2.38 0.631 0.0234 5.02 0.724 3.95e-06
Pyruvate2acetate.formate Haemophilus_aegyptius_ATCC_11116_genomic_scaffold gb.GL878527.1.region001.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Haemophilus_aegyptius_ATCC_11116_genomic_scaffold..SMASHregion.region001..NR.2 2.16 0.574 0.0236 3.52 0.679 0.000428
acetate2butyrate Clostridium_botulinum_B_str._Eklund gb.CP001056.1.region002.GC_DNA..Entryname.acetate2butyrate..OS.Clostridium_botulinum_B_str._Eklund..SMASHregion.region002..NR.5..BG.2 2.42 0.646 0.0245 2.53 0.754 0.0396
OD_fatty_acids Dialister_succinatiphilus_YIT_11850_genomic_scaffold gb.JH591188.1.region001.GC_DNA..Entryname.OD_fatty_acids..OS.Dialister_succinatiphilus_YIT_11850_genomic_scaffold..SMASHregion.region001..NR.1 2.94 0.793 0.0265 2.54 0.579 0.00342
Pyruvate2acetate.formate Clostridium_butyricum_strain_JKY6D1_chromosome gb.CP013352.1.region004.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Clostridium_butyricum_strain_JKY6D1_chromosome..SMASHregion.region004..NR.6 3.46 0.938 0.0265 3.35 0.936 0.0241
TPP_AA_metabolism Haemophilus_sputorum_HK_2154 gb.ALJP01000004.1.region001.GC_DNA..Entryname.TPP_AA_metabolism..OS.Haemophilus_sputorum_HK_2154..SMASHregion.region001..NR.1 2.96 0.812 0.0299 4.52 0.866 0.000392
Fumarate2succinate Haemophilus_sp._HMSC61B11_genomic_scaffold gb.KV838003.1.region001.GC_DNA..Entryname.Fumarate2succinate..OS.Haemophilus_sp._HMSC61B11_genomic_scaffold..SMASHregion.region001..NR.5 3.34 0.924 0.0312 4.62 1.01 0.0022
Respiratory_glycerol Aggregatibacter_sp._oral_taxon_458_str._W10330_genomic_scaffold gb.KE952617.1.region001.GC_DNA..Entryname.Respiratory_glycerol..OS.Aggregatibacter_sp._oral_taxon_458_str._W10330_genomic_scaffold..SMASHregion.region001..NR.1 2.91 0.808 0.0323 4.64 0.926 0.000678
TPP_AA_metabolism Haemophilus_pittmaniae_HK_85 gb.AFUV01000010.1.region001.GC_DNA..Entryname.TPP_AA_metabolism..OS.Haemophilus_pittmaniae_HK_85..SMASHregion.region001..NR.1 2.43 0.686 0.0353 5.22 0.896 8.17e-05
PFOR_II_pathway Clostridium_celatum_DSM_1785_genomic_scaffold gb.KB291660.1.region001.GC_DNA..Entryname.PFOR_II_pathway..OS.Clostridium_celatum_DSM_1785_genomic_scaffold..SMASHregion.region001..NR.1 3.51 0.994 0.0353 3.77 0.982 0.0132
acetate2butyrate Clostridium_celatum_DSM_1785_genomic_scaffold gb.KB291615.1.region001.GC_DNA..Entryname.acetate2butyrate..OS.Clostridium_celatum_DSM_1785_genomic_scaffold..SMASHregion.region001..NR.1 3.41 0.981 0.0397 3.41 0.931 0.0197
Rnf_complex Haemophilus_parainfluenzae_HK262 gb.AJMW01000064.1.region001.GC_DNA..Entryname.Rnf_complex..OS.Haemophilus_parainfluenzae_HK262..SMASHregion.region001..NR.6..BG.2 2.8 0.814 0.0418 5 0.992 0.000636
OD_fatty_acids.PFOR_II_pathway Clostridium_botulinum_B_str._Eklund gb.CP001056.1.region005.GC_DNA..Entryname.OD_fatty_acids.PFOR_II_pathway..OS.Clostridium_botulinum_B_str._Eklund..SMASHregion.region005..NR.5..BG.2 2.57 0.772 0.0497 3.16 0.835 0.0148
combined_results$negative_coef %>%
  select(!c(metadata, value, N_bang, N.not.0_bang, pval_bang, N_mal, N.not.0_mal, pval_mal)) %>% 
  rename(`Coefficient Bangladesh` = coef_bang, `Standard Error Bangladesh` = stderr_bang, `Q-value Bangladesh` = qval_bang, `Coefficient Malawi` = coef_mal, `Standard Error Malawi` = stderr_mal, `Q-value Malawi` = qval_mal) %>% 
  dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% 
  kbl() %>% kable_styling()
MGC_class Species feature Coefficient Bangladesh Standard Error Bangladesh Q-value Bangladesh Coefficient Malawi Standard Error Malawi Q-value Malawi
NADH_dehydrogenase_I Actinomyces_viscosus_C505_genomic_scaffold gb.KI391968.1.region002.GC_DNA..Entryname.NADH_dehydrogenase_I..OS.Actinomyces_viscosus_C505_genomic_scaffold..SMASHregion.region002..NR.5 -3.01 0.574 0.000568 -2.9 0.796 0.0205
Nitrate_reductase Actinomyces_johnsonii_F0510_genomic_scaffold gb.KE951633.1.region001.GC_DNA..Entryname.Nitrate_reductase..OS.Actinomyces_johnsonii_F0510_genomic_scaffold..SMASHregion.region001..NR.2 -2.76 0.544 0.000832 -2.26 0.657 0.0332
Rnf_complex.TPP_AA_metabolism Eubacterium_siraeum_DSM_15702_Scfld_03_40_genomic gb.DS499548.1.region001.GC_DNA..Entryname.Rnf_complex.TPP_AA_metabolism..OS.Eubacterium_siraeum_DSM_15702_Scfld_03_40_genomic..SMASHregion.region001..NR.2 -2.44 0.727 0.047 -4.29 1.23 0.0298
combined_results$positive_coef %>% mutate(split_species = str_split(Species, '_')) %>% mutate(genus = sapply(split_species, "[[", 1)) %>% mutate(specific = sapply(split_species, "[[", 2)) %>% mutate(clean_species = paste(genus, specific, sep = ' ')) %>% select(-split_species, -genus, -specific) %>% relocate(clean_species, .after = Species) %>% group_by(MGC_class) %>% arrange(Species) %>% summarise(n = n(), Species = paste(clean_species, collapse = ', ')) %>% arrange(desc(n), MGC_class) %>%  kbl() %>% kable_styling()
MGC_class n Species
Pyruvate2acetate.formate 7 Clostridium butyricum, Clostridium carboxidivorans, Haemophilus aegyptius, Haemophilus haemolyticus, Haemophilus parainfluenzae, Prevotella copri, Prevotella sp.
TPP_AA_metabolism 4 Clostridium celatum, Haemophilus pittmaniae, Haemophilus sputorum, Prevotella sp.
PFOR_II_pathway 3 Clostridium celatum, Romboutsia ilealis, Romboutsia ilealis
Rnf_complex 3 Haemophilus parainfluenzae, Prevotella copri, Romboutsia ilealis
Others_HGD_unassigned 2 Prevotella copri, Romboutsia ilealis
Respiratory_glycerol 2 Aggregatibacter sp., Haemophilus parainfluenzae
acetate2butyrate 2 Clostridium botulinum, Clostridium celatum
Arginine2_Hcarbonate 1 .Clostridium. sordellii
Arginine2putrescine.Putrescine2spermidine 1 Romboutsia sp.
Formate_dehydrogenase 1 Haemophilus sp.
Fumarate2succinate 1 Haemophilus sp.
OD_eut_pdu_related.PFOR_II_pathway 1 Paeniclostridium sordellii
OD_fatty_acids 1 Dialister succinatiphilus
OD_fatty_acids.PFOR_II_pathway 1 Clostridium botulinum
Pyruvate2acetate.formate.Acetyl.CoA_pathway 1 Romboutsia ilealis
TPP_AA_metabolism.Arginine2putrescine 1 Haemophilus sp.
TPP_fatty_acids 1 Prevotella copri
porA 1 Prevotella copri
succinate2propionate 1 Dialister invisus
combined_results$negative_coef %>% mutate(split_species = str_split(Species, '_')) %>% mutate(genus = sapply(split_species, "[[", 1)) %>% mutate(specific = sapply(split_species, "[[", 2)) %>% mutate(clean_species = paste(genus, specific, sep = ' ')) %>% select(-split_species, -genus, -specific) %>% relocate(clean_species, .after = Species) %>% group_by(MGC_class) %>% arrange(Species) %>% summarise(n = n(), Species = paste(clean_species, collapse = ', ')) %>% arrange(desc(n), MGC_class) %>%  kbl() %>% kable_styling()
MGC_class n Species
NADH_dehydrogenase_I 1 Actinomyces viscosus
Nitrate_reductase 1 Actinomyces johnsonii
Rnf_complex.TPP_AA_metabolism 1 Eubacterium siraeum

1.7 healthy vs carrier

1.7.1 phylum plots

order_of_groups <- c("Carrier", "Healthy")
# bangladesh_phyla_plot <- plot_per_country_abundance(phyla_clean_metadata = phyla_clean_metadata, country = "Bangladesh", group_order = order_of_groups)
# bangladesh_phyla_plot
malawi_phyla_plot <- plot_per_country_abundance(phyla_clean_metadata = phyla_clean_metadata, country = "Malawi", group_order = order_of_groups)
nepal_phyla_plot <- plot_per_country_abundance(phyla_clean_metadata = phyla_clean_metadata, country = "Nepal", group_order = order_of_groups)

# bangladesh_phyla_plot / 
malawi_phyla_plot / nepal_phyla_plot

1.7.2 alpha diversity

Alpha diversity - all countries, healthy and carrier

all_countries_healthy_carrier_alpha <- metaphlan_alpha(strataa_metaphlan_data, strataa_metaphlan_metadata, countries_of_interest = c('Malawi', 'Nepal'), groups_of_interest = c('Carrier', 'Control_HealthySerosurvey'), comparisons = list(c('Carrier', 'Control_HealthySerosurvey')))
all_countries_healthy_carrier_alpha$alpha_by_country

all_countries_healthy_carrier_alpha$alpha_anova_summary_with_var_names %>% dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% kbl() %>% kable_styling()
rownames(alpha_anova_summary[[1]]) Df Sum.Sq Mean.Sq F.value Pr..F. is_it_significant
Country:Group 1 4.82 4.82 40.9 3.97e-09 significant
Group 1 1.41 1.41 11.9 0.000786 significant
Group:Age 1 0.639 0.639 5.42 0.0217 not_significant
Country:Sex:Age 1 0.467 0.467 3.96 0.0491 not_significant
Country 1 0.438 0.438 3.72 0.0564 not_significant
Sex:Group 1 0.35 0.35 2.97 0.0875 not_significant
Age 1 0.0966 0.0966 0.819 0.367 not_significant
Country:Sex:Group 1 0.0738 0.0738 0.626 0.43 not_significant
Sex:Age 1 0.0673 0.0673 0.571 0.451 not_significant
Country:Group:Age 1 0.0151 0.0151 0.128 0.721 not_significant
Country:Sex 1 0.0139 0.0139 0.118 0.732 not_significant
Sex 1 0.00909 0.00909 0.0771 0.782 not_significant
Country:Age 1 0.00766 0.00766 0.065 0.799 not_significant
Country:Sex:Group:Age 1 0.00184 0.00184 0.0156 0.901 not_significant
Sex:Group:Age 1 6.12e-06 6.12e-06 5.19e-05 0.994 not_significant
Residuals 111 13.1 0.118 NA NA NA
all_countries_healthy_carrier_alpha$alpha_plot_group

1.7.3 beta diversity

Carrier vs healthy.

all_countries_beta_carrier_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Malawi', 'Nepal'), c('Carrier', 'Control_HealthySerosurvey'))
all_countries_beta_carrier_healthy$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.0653601 0.0009990 significant
Sex:Age 0.0141431 0.0199800 not_significant
Group:Age 0.0127135 0.0469530 not_significant
Age 0.0090662 0.2367632 not_significant
Sex:Group 0.0077753 0.3676324 not_significant
Sex 0.0072119 0.4195804 not_significant
Sex:Group:Age 0.0068119 0.5054945 not_significant
Total 1.0000000 NA NA
Residual 0.8769180 NA NA
# bgd_beta_carrier_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Bangladesh'), c('Carrier', 'Control_HealthySerosurvey'))
# bgd_beta_carrier_healthy$pn_res %>% kbl %>% kable_styling()

mal_beta_carrier_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Malawi'), c('Carrier', 'Control_HealthySerosurvey'))
mal_beta_carrier_healthy$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.1999878 0.0009990 significant
Sex:Age 0.0304467 0.0049950 significant
Age 0.0187062 0.0519481 not_significant
Group:Age 0.0147510 0.1348651 not_significant
Sex:Group 0.0118411 0.2627373 not_significant
Sex 0.0099215 0.4385614 not_significant
Sex:Group:Age 0.0046358 0.9080919 not_significant
Total 1.0000000 NA NA
Residual 0.7097099 NA NA
nep_beta_carrier_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Nepal'), c('Carrier', 'Control_HealthySerosurvey'))
nep_beta_carrier_healthy$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.0836999 0.0009990 significant
Sex:Group 0.0241616 0.2477522 not_significant
Sex:Age 0.0240379 0.2687313 not_significant
Age 0.0195876 0.4735265 not_significant
Sex 0.0174225 0.6013986 not_significant
Sex:Group:Age 0.0149025 0.7352647 not_significant
Group:Age 0.0098712 0.9870130 not_significant
Total 1.0000000 NA NA
Residual 0.8063168 NA NA
mal_beta_carrier_healthy$pc12 + nep_beta_carrier_healthy$pc12 + plot_layout(guides = 'collect')

1.7.4 maaslin taxonomic groups

groups_to_analyse <- c('Carrier', 'Control_HealthySerosurvey')
# bang_variables_for_analysis <- c("Group", "Sex", "Age")
mwi_variables_for_analysis <- c("Group", "Sex", "Age", "sequencing_lane")
nep_variables_for_analysis <- c("Group", "Sex", "Age")
# bangladesh_taxonomic_maaslin <- read_in_maaslin('Bangladesh', groups_to_analyse, bang_variables_for_analysis, 'metaphlan')
malawi_taxonomic_maaslin <- read_in_maaslin('Malawi', groups_to_analyse, mwi_variables_for_analysis, 'metaphlan')
nepal_taxonomic_maaslin <- read_in_maaslin('Nepal', groups_to_analyse, nep_variables_for_analysis, 'metaphlan')
# bangladesh_taxonomic_maaslin_filtered <- filter_taxonomic_maaslin(bangladesh_taxonomic_maaslin)
malawi_taxonomic_maaslin_filtered <- filter_taxonomic_maaslin(malawi_taxonomic_maaslin)
nepal_taxonomic_maaslin_filtered <- filter_taxonomic_maaslin(nepal_taxonomic_maaslin)
# bangladesh_maaslin_stats <- basic_maaslin_stats(bangladesh_taxonomic_maaslin_filtered, 'Bangladesh', bang_variables_for_analysis, groups_to_analyse)
malawi_maaslin_stats <- basic_maaslin_stats(malawi_taxonomic_maaslin_filtered, 'Malawi', mwi_variables_for_analysis, groups_to_analyse)
nepal_maaslin_stats <- basic_maaslin_stats(nepal_taxonomic_maaslin_filtered, 'Nepal', nep_variables_for_analysis, groups_to_analyse)
# / bangladesh_maaslin_stats$volcano_plot 
malawi_maaslin_stats$volcano_plot / nepal_maaslin_stats$volcano_plot

nrow(malawi_maaslin_stats$maaslin_results_sig)
## [1] 117
# nrow(bangladesh_maaslin_stats$maaslin_results_sig)
nrow(nepal_maaslin_stats$maaslin_results_sig)
## [1] 1

We’re not including the bangladesh samples in this analysis, because the bangladesh carriers were processed differently (extracted without being frozen).

There are no species significantly associated with carrier status in all Mal and Nep, so not including the combine.

# combined_results <- run_combine_maaslins(bangladesh_taxonomic_maaslin_filtered, malawi_taxonomic_maaslin_filtered, bang_variables_for_analysis, mwi_variables_for_analysis, groups_to_analyse, 'metaphlan', maaslin_taxonomic_output_root_folder)

# only one species significantly associated in Nepal.
# there are no species consistently associated across all three sites
# bang_mal_nep <- list(bangladesh_taxonomic_maaslin_filtered, malawi_taxonomic_maaslin_filtered, nepal_taxonomic_maaslin_filtered)
# combined_results <- run_combine_maaslins(bang_mal_nep, c('_bang', '_mwi', '_nep'), mwi_variables_for_analysis, groups_to_analyse, 'metaphlan', maaslin_taxonomic_output_root_folder)
# View(combined_results$positive_coef)
# View(combined_results$negative_coef)

# there are no species associated in both bang and nep
# bang_nep <- list(bangladesh_taxonomic_maaslin_filtered, nepal_taxonomic_maaslin_filtered)
# combined_results <- run_combine_maaslins(bang_nep, c('_bang', '_nep'), bang_variables_for_analysis, groups_to_analyse, 'metaphlan', maaslin_taxonomic_output_root_folder)
# View(combined_results$positive_coef)
# View(combined_results$negative_coef)

# there are no species associated in both mal and nep
# mal_nep <- list(malawi_taxonomic_maaslin_filtered, nepal_taxonomic_maaslin_filtered)
# combined_results <- run_combine_maaslins(bang_nep, c('_mal', '_nep'), bang_variables_for_analysis, groups_to_analyse, 'metaphlan', maaslin_taxonomic_output_root_folder)
# View(combined_results$positive_coef)
# View(combined_results$negative_coef)

# bang_mal <- list(bangladesh_taxonomic_maaslin_filtered, malawi_taxonomic_maaslin_filtered)
# combined_results <- run_combine_maaslins(bang_mal, c('_bang', '_mal'), mwi_variables_for_analysis, groups_to_analyse, 'metaphlan', maaslin_taxonomic_output_root_folder)

# # positive is associated with health
# combined_results$positive_coef %>% filter(grepl('^s', lowest_taxonomic_level)) %>% 
#   select(!c(metadata, value, N_bang, N.not.0_bang, pval_bang, N_mal, N.not.0_mal, pval_mal)) %>% 
#   rename(Species = lowest_taxonomic_level, `Coefficient Bangladesh` = coef_bang, `Standard Error Bangladesh` = stderr_bang, `Q-value Bangladesh` = qval_bang, `Coefficient Malawi` = coef_mal, `Standard Error Malawi` = stderr_mal, `Q-value Malawi` = qval_mal) %>% 
#   dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% 
#   kbl() %>% kable_styling()

# # negative is associated with carrier
# combined_results$negative_coef %>% filter(grepl('^s', lowest_taxonomic_level)) %>% 
#   select(!c(metadata, value, N_bang, N.not.0_bang, pval_bang, N_mal, N.not.0_mal, pval_mal)) %>% 
#   rename(Species = lowest_taxonomic_level, `Coefficient Bangladesh` = coef_bang, `Standard Error Bangladesh` = stderr_bang, `Q-value Bangladesh` = qval_bang, `Coefficient Malawi` = coef_mal, `Standard Error Malawi` = stderr_mal, `Q-value Malawi` = qval_mal) %>% 
#   dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% 
#   kbl() %>% kable_styling()

# nrow(combined_results$mwi_maaslin_only)
# nrow(combined_results$bang_maaslin_only)

1.7.5 functional groups

# bang_variables_for_analysis <- c("Group", "Sex", "Age")
mwi_variables_for_analysis <- c("Group", "Sex", "Age", "sequencing_lane")
# nep_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions")
groups_to_analyse <- c('Carrier', 'Control_HealthySerosurvey')

# bang_func_carr_health_maaslin <- read_in_maaslin('Bangladesh', groups_to_analyse, bang_variables_for_analysis, 'bigmap')
mal_func_carr_health_maaslin <- read_in_maaslin('Malawi', groups_to_analyse, mwi_variables_for_analysis, 'bigmap')
# combined_results <- run_combine_maaslins(bang_func_carr_health_maaslin, mal_func_carr_health_maaslin, bang_variables_for_analysis, mwi_variables_for_analysis, groups_to_analyse, 'bigmap', maaslin_functional_output_root_folder)

# bang_mal <- list(bang_func_carr_health_maaslin, mal_func_carr_health_maaslin)
# combined_results <- run_combine_maaslins(bang_mal, c('_bang', '_mal'), mwi_variables_for_analysis, groups_to_analyse, 'bigmap', maaslin_functional_output_root_folder)

# # positive is associated with health
# # combined_results$positive_coef %>%  kbl() %>% kable_styling()
# combined_results$positive_coef %>% 
#   select(!c(metadata, value, N_bang, N.not.0_bang, pval_bang, N_mal, N.not.0_mal, pval_mal)) %>% 
#   rename(`Coefficient Bangladesh` = coef_bang, `Standard Error Bangladesh` = stderr_bang, `Q-value Bangladesh` = qval_bang, `Coefficient Malawi` = coef_mal, `Standard Error Malawi` = stderr_mal, `Q-value Malawi` = qval_mal) %>% 
#   dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% 
#   kbl() %>% kable_styling()
# # negative is associated with carrier
# # combined_results$negative_coef %>%  kbl() %>% kable_styling()
# combined_results$negative_coef %>% 
#   select(!c(metadata, value, N_bang, N.not.0_bang, pval_bang, N_mal, N.not.0_mal, pval_mal)) %>% 
#   rename(`Coefficient Bangladesh` = coef_bang, `Standard Error Bangladesh` = stderr_bang, `Q-value Bangladesh` = qval_bang, `Coefficient Malawi` = coef_mal, `Standard Error Malawi` = stderr_mal, `Q-value Malawi` = qval_mal) %>% 
#   dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% 
#   kbl() %>% kable_styling()


# # print out a nicely formatted table of the count of the MGCs
# combined_results$positive_coef %>% mutate(split_species = str_split(Species, '_')) %>% mutate(genus = sapply(split_species, "[[", 1)) %>% mutate(specific = sapply(split_species, "[[", 2)) %>% mutate(clean_species = paste(genus, specific, sep = ' ')) %>% select(-split_species, -genus, -specific) %>% relocate(clean_species, .after = Species) %>% group_by(MGC_class) %>% arrange(Species) %>% summarise(n = n(), Species = paste(clean_species, collapse = ', ')) %>% arrange(desc(n), MGC_class) %>%  kbl() %>% kable_styling()
# combined_results$negative_coef %>% mutate(split_species = str_split(Species, '_')) %>% mutate(genus = sapply(split_species, "[[", 1)) %>% mutate(specific = sapply(split_species, "[[", 2)) %>% mutate(clean_species = paste(genus, specific, sep = ' ')) %>% select(-split_species, -genus, -specific) %>% relocate(clean_species, .after = Species) %>% group_by(MGC_class) %>% arrange(Species) %>% summarise(n = n(), Species = paste(clean_species, collapse = ', ')) %>% arrange(desc(n), MGC_class) %>%  kbl() %>% kable_styling()

1.8 healthy vs acute vs carriers - taxonomic

groups_to_analyse <- c('Carrier', 'Control_HealthySerosurvey', 'Acute_Typhi')
variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions", "sequencing_lane")
heal_ac_car_taxonomic <- read_in_maaslin('Malawi', groups_to_analyse, variables_for_analysis, 'metaphlan')
heal_ac_car_taxonomic_cln <- filter_taxonomic_maaslin(heal_ac_car_taxonomic) #%>% filter(qval <= 0.05) 
# spread heal_ac_car_taxonomic_cln wider so that the coef is on the same row for each feature
heal_ac_car_taxonomic_cln_w <- heal_ac_car_taxonomic_cln %>% pivot_wider(names_from = value, values_from = c(coef, qval), id_cols = c(feature, lowest_taxonomic_level))

changes relative to healthy state

# same_same cant do from here because filtered for qvalue
# 1
up_up <- heal_ac_car_taxonomic_cln_w %>% filter(coef_Acute_Typhi > 0 & coef_Carrier > 0 & qval_Acute_Typhi < 0.05 & qval_Carrier < 0.05) %>% mutate(type = '1.up_up')
print(nrow(up_up))
## [1] 68
# write_csv(up_up, file.path(maaslin_taxonomic_output_root_folder, 'blantyre.health_acute_carrier.1.up_up.csv'))
# 2
up_down <- heal_ac_car_taxonomic_cln_w %>% filter(coef_Acute_Typhi > 0 & coef_Carrier < 0 & qval_Acute_Typhi < 0.05 & qval_Carrier < 0.05) %>% mutate(type = '2.up_down')
print(nrow(up_down))
## [1] 0
# write_csv(up_down, file.path(maaslin_taxonomic_output_root_folder, 'blantyre.health_acute_carrier.2.up_down.csv'))
# 3
up_same <- heal_ac_car_taxonomic_cln_w %>% filter(coef_Acute_Typhi > 0 & qval_Acute_Typhi < 0.05 & qval_Carrier >= 0.05) %>% mutate(type = '3.up_same')
print(nrow(up_same))
## [1] 102
# write_csv(up_same, file.path(maaslin_taxonomic_output_root_folder, 'blantyre.health_acute_carrier.3.up_same.csv'))
# 4
down_up <- heal_ac_car_taxonomic_cln_w %>% filter(coef_Acute_Typhi < 0 & coef_Carrier > 0 & qval_Acute_Typhi < 0.05 & qval_Carrier < 0.05) %>% mutate(type = '4.down_up')
print(nrow(down_up))
## [1] 0
# write_csv(down_up, file.path(maaslin_taxonomic_output_root_folder, 'blantyre.health_acute_carrier.4.down_up.csv'))
# 5
down_down <- heal_ac_car_taxonomic_cln_w %>% filter(coef_Acute_Typhi < 0 & coef_Carrier < 0 & qval_Acute_Typhi < 0.05 & qval_Carrier < 0.05) %>% mutate(type = '5.down_down')
print(nrow(down_down))
## [1] 85
# write_csv(down_down, file.path(maaslin_taxonomic_output_root_folder, 'blantyre.health_acute_carrier.5.down_down.csv'))
# 6
down_same <- heal_ac_car_taxonomic_cln_w %>% filter(coef_Acute_Typhi < 0 & qval_Acute_Typhi < 0.05 & qval_Carrier > 0.05) %>% mutate(type = '6.down_same')
print(nrow(down_same))
## [1] 59
# write_csv(down_same, file.path(maaslin_taxonomic_output_root_folder, 'blantyre.health_acute_carrier.6.down_same.csv'))
# 7
same_up <- heal_ac_car_taxonomic_cln_w %>% filter(qval_Acute_Typhi > 0.05 & qval_Carrier < 0.05 & coef_Carrier > 0) %>% mutate(type = '7.same_up')
print(nrow(same_up))
## [1] 90
# write_csv(same_up, file.path(maaslin_taxonomic_output_root_folder, 'blantyre.health_acute_carrier.7.same_up.csv'))
# 8
same_down <- heal_ac_car_taxonomic_cln_w %>% filter(qval_Acute_Typhi > 0.05 & qval_Carrier < 0.05 & coef_Carrier < 0) %>% mutate(type = '8.same_down')
print(nrow(same_down))
## [1] 15
# 9
same_same <- heal_ac_car_taxonomic_cln_w %>% filter(qval_Acute_Typhi >= 0.05 & qval_Carrier >= 0.05) %>% mutate(type = '9.same_same')
print(nrow(same_same))
## [1] 970
all_patterns <- rbind(up_up, up_down, up_same, down_up, down_down, down_same, same_up, same_down, same_same)

write_csv(all_patterns, file.path(maaslin_taxonomic_output_root_folder, 'blantyre.health_acute_carrier.all_patterns.csv')) 

up_in_ac_car <- all_patterns %>% filter(type %in% c('1.up_up', '2.up_down', '3.up_same', '4.down_up', '7.same_up')) 
down_in_ac_car <- all_patterns %>% filter(type %in% c('5.down_down', '6.down_same', '8.same_down'))
up_in_health_only <- all_patterns %>% filter(type == '5.down_down')
up_in_car_only <- all_patterns %>% filter(type == '7.same_up')

prop_that_is_sgb <- function(input_data) {
  x <- input_data %>% filter(str_detect(feature, 'SGB')) %>% summarise(prop = n()/nrow(input_data), n_sgb = n(), total_n = nrow(input_data))
  print(x)
  return(x)
}
prop_up_in_ac_car <- prop_that_is_sgb(up_in_ac_car)
## # A tibble: 1 × 3
##    prop n_sgb total_n
##   <dbl> <int>   <int>
## 1 0.781   203     260
prop_down_in_ac_car <- prop_that_is_sgb(down_in_ac_car)
## # A tibble: 1 × 3
##    prop n_sgb total_n
##   <dbl> <int>   <int>
## 1 0.258    41     159
prop_up_in_health_only <- prop_that_is_sgb(up_in_health_only)
## # A tibble: 1 × 3
##    prop n_sgb total_n
##   <dbl> <int>   <int>
## 1 0.235    20      85
prop_up_in_car_only <- prop_that_is_sgb(up_in_car_only)
## # A tibble: 1 × 3
##    prop n_sgb total_n
##   <dbl> <int>   <int>
## 1 0.811    73      90
prop.test(x = c(prop_up_in_ac_car$n_sgb, prop_down_in_ac_car$n_sgb), n = c(prop_up_in_ac_car$total_n, prop_down_in_ac_car$total_n), alternative = 'two.sided', correct = FALSE)
## 
##  2-sample test for equality of proportions without continuity
##  correction
## 
## data:  c(prop_up_in_ac_car$n_sgb, prop_down_in_ac_car$n_sgb) out of c(prop_up_in_ac_car$total_n, prop_down_in_ac_car$total_n)
## X-squared = 110.92, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.4383352 0.6074800
## sample estimates:
##    prop 1    prop 2 
## 0.7807692 0.2578616
prop.test(x = c(prop_up_in_health_only$n_sgb, prop_up_in_car_only$n_sgb), n = c(prop_up_in_health_only$total_n, prop_up_in_car_only$total_n), alternative = 'two.sided', correct = FALSE)
## 
##  2-sample test for equality of proportions without continuity
##  correction
## 
## data:  c(prop_up_in_health_only$n_sgb, prop_up_in_car_only$n_sgb) out of c(prop_up_in_health_only$total_n, prop_up_in_car_only$total_n)
## X-squared = 58.207, df = 1, p-value = 2.36e-14
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.6969416 -0.4546924
## sample estimates:
##    prop 1    prop 2 
## 0.2352941 0.8111111
# make a new strataa_metaphlan_data object where the rowname column is an actual column

strataa_metaphlan_data_cln <- strataa_metaphlan_data %>% mutate(feature = rownames(strataa_metaphlan_data)) 
rownames(strataa_metaphlan_data_cln) <- NULL
strataa_metaphlan_data_l <- strataa_metaphlan_data_cln %>% pivot_longer(!c(feature, lowest_taxonomic_level), names_to = 'sample', values_to = 'abundance') %>% left_join(strataa_metaphlan_metadata %>% mutate(sample = rownames(strataa_metaphlan_metadata)), by = 'sample')
# head(strataa_metaphlan_data_l)
strataa_metaphlan_data_mwi_sp <- strataa_metaphlan_data_l %>% filter(Country == 'Malawi', str_detect(feature, 's__'), str_detect(feature, 't__', negate = TRUE)) %>% mutate(SGB = str_detect(feature, '_SGB'))
head(strataa_metaphlan_data_mwi_sp)
## # A tibble: 6 × 13
##   lowest_taxonomic_level feature      sample abundance Group Sex   Country   Age
##   <chr>                  <chr>        <chr>      <dbl> <chr> <chr> <chr>   <dbl>
## 1 s__Prevotella_sp_885   k__Bacteria… 32580…     0     Acut… Fema… Malawi   9.71
## 2 s__Prevotella_sp_885   k__Bacteria… 32580…     0     Acut… Fema… Malawi   3.84
## 3 s__Prevotella_sp_885   k__Bacteria… 32580…     0     Acut… Male  Malawi   5.63
## 4 s__Prevotella_sp_885   k__Bacteria… 32580…     0.150 Acut… Fema… Malawi  20.3 
## 5 s__Prevotella_sp_885   k__Bacteria… 32580…     0     Acut… Male  Malawi   7.51
## 6 s__Prevotella_sp_885   k__Bacteria… 32580…     0     Acut… Fema… Malawi   3.84
## # ℹ 5 more variables:
## #   Antibiotics_taken_before_sampling_yes_no_assumptions <chr>,
## #   sequencing_lane <chr>, SampleID <chr>, age_bracket <fct>, SGB <lgl>
mwi_sgb_abundance <- strataa_metaphlan_data_mwi_sp %>% group_by(sample, SGB) %>% summarise(total_abundance = sum(abundance)) %>% left_join(strataa_metaphlan_metadata %>% mutate(sample = rownames(strataa_metaphlan_metadata)), by = 'sample')  %>% mutate(SGB = ifelse(SGB, 'SGB', 'non-SGB')) %>% mutate(Group = factor(Group, levels = c('Control_HealthySerosurvey', 'Acute_Typhi', 'Carrier')))
mwi_sgb_abundance %>% group_by(SGB, Group) %>% summarise(median_abundance = median(total_abundance), n = n()) %>% kbl() %>% kable_styling()
SGB Group median_abundance n
SGB Control_HealthySerosurvey 13.88699 40
SGB Acute_Typhi 39.98151 34
SGB Carrier 46.65501 40
non-SGB Control_HealthySerosurvey 86.11299 40
non-SGB Acute_Typhi 60.01852 34
non-SGB Carrier 53.34503 40
# mwi_sgb_abundance <- mwi_sgb_abundance 
# do a scatter plot of the abundance of sgb in each group
my_comparisons <- list(c('Acute_Typhi', 'Carrier'), c('Acute_Typhi', 'Control_HealthySerosurvey'), c('Carrier', 'Control_HealthySerosurvey'))
mwi_sgb_abundance_plot <- ggplot(mwi_sgb_abundance %>% filter(SGB == 'SGB'), aes(x = Group, y = total_abundance, color = Group)) +
  geom_boxplot() +
  stat_compare_means(comparisons = my_comparisons) + 
  stat_compare_means(label.y = 70) + 
  theme(legend.position = "none") + 
  labs(y = 'SGB abundance in metagenome (%)', x = '') + 
  theme(text = element_text(size=18)) + 
  scale_x_discrete(labels= c('Control', 'Acute', 'Carrier'))

mwi_sgb_abundance_plot$layers[[2]]$aes_params$textsize <- 4
# which_layers(mwi_sgb_abundance_plot, "GeomSignif")

ggsave(file.path(metaphlan_input_folder, 'mwi_sgb_abundance.png'), mwi_sgb_abundance_plot, width = 8, height = 4.8, units = 'in', dpi = 300)
# heal_ac_car_taxonomic_all_w %>% filter(is.na(qval_Carrier))
to_get_list <- c('k__Bacteria.p__Firmicutes.c__Clostridia.o__Eubacteriales.f__Oscillospiraceae.g__Faecalibacterium.s__Faecalibacterium_prausnitzii', 'k__Bacteria.p__Bacteroidetes.c__Bacteroidia.o__Bacteroidales.f__Barnesiellaceae.g__Barnesiella.s__Barnesiella_intestinihominis', 'k__Bacteria.p__Firmicutes.c__Bacilli.o__Lactobacillales.f__Lactobacillaceae.g__Limosilactobacillus.s__Limosilactobacillus_reuteri', 'k__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Pasteurellales.f__Pasteurellaceae.g__Haemophilus.s__Haemophilus_parainfluenzae', 'k__Bacteria.p__Firmicutes.c__Erysipelotrichia.o__Erysipelotrichales.f__Erysipelotrichaceae.g__Holdemanella.s__Holdemanella_biformis', 'k__Bacteria.p__Firmicutes.c__Clostridia.o__Eubacteriales.f__Lachnospiraceae.g__Mediterraneibacter.s__Ruminococcus_gnavus', 'k__Bacteria.p__Firmicutes.c__Clostridia.o__Eubacteriales.f__Peptostreptococcaceae.g__Paeniclostridium.s__Eubacterium_tenue', 'k__Bacteria.p__Firmicutes.c__Erysipelotrichia.o__Erysipelotrichales.f__Turicibacteraceae.g__Turicibacter.s__Turicibacter_bilis', 'k__Bacteria.p__Firmicutes.c__Erysipelotrichia.o__Erysipelotrichales.f__Turicibacteraceae.g__Turicibacter.s__Turicibacter_sanguinis', 'k__Bacteria.p__Bacteroidetes.c__Bacteroidia.o__Bacteroidales.f__Odoribacteraceae.g__Odoribacter.s__Odoribacter_splanchnicus', 'k__Bacteria.p__Firmicutes.c__Clostridia.o__Eubacteriales.f__Lachnospiraceae.g__Blautia.s__Blautia_faecis', 'k__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Enterobacterales.f__Enterobacteriaceae.g__Escherichia.s__Escherichia_coli', 'k__Bacteria.p__Firmicutes.c__Clostridia.o__Eubacteriales.f__Lachnospiraceae.g__Lachnospiraceae_unclassified.s__Eubacterium_rectale', 'k__Bacteria.p__Bacteroidetes.c__Bacteroidia.o__Bacteroidales.f__Bacteroidaceae.g__Bacteroides.s__Bacteroides_fragilis', 'k__Bacteria.p__Firmicutes.c__Erysipelotrichia.o__Erysipelotrichales.f__Erysipelotrichaceae.g__Holdemanella.s__Holdemanella_porci', 'k__Bacteria.p__Bacteroidetes.c__Bacteroidia.o__Bacteroidales.f__Prevotellaceae.g__Prevotellamassilia.s__Prevotellamassilia_timonensis')
# this one doesn't include the Oscillospiraceae, so will just have to point towards the supplementary table for those.
heal_acc_carr_table <- all_patterns %>% filter(feature %in% to_get_list)
write_csv(heal_acc_carr_table, file.path(maaslin_taxonomic_output_root_folder, 'blantyre.health_acute_carrier.health_acc_carr_table.csv'))